# -*-coding=ansi-*-
# Scrape the Xueqiu articles posted by user metalslime
__author__ = 'Rocky '+'dergrey'
import requests,re,json,time
from toolkit import Toolkit
from lxml import etree

# A single Session object is reused for every request made by this script.
session = requests.Session()

# Browser-like request headers sent with each call to xueqiu.com.
agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
headers = {
    'Host': 'xueqiu.com',
    'Referer': 'https://xueqiu.com/',
    'Origin': 'https://xueqiu.com',
    'User-Agent': agent,
}

# Xueqiu user id whose articles are watched (metalslime).
usr = '2805899520'
# Alternative id used for testing against the author's own account:
# usr = '4293366915'

# ---------------------------------------------------------------------------
# Polling loop: periodically read the watched user's profile page, and when
# the newest status id differs from the last one seen, download that article
# and store it through Toolkit.save2file.
# ---------------------------------------------------------------------------

# Patterns are compiled once, outside the loop.  Raw strings keep the regex
# escapes (\d, \/) from being parsed as (invalid) Python string escapes.
_LAST_STATUS_RE = re.compile(r'"last_status_id":(\d+)')
_ARTICLE_TEXT_RE = re.compile(r'text":"(.*?)","')
_TITLE_STRIP_RE = re.compile(r'[-\/:*?"<>雪球|]')

_STATE_FILE = "artical_old.txt"   # remembers the last archived article id
_REQUEST_TIMEOUT = 30             # seconds; without it a stalled socket blocks forever
_POLL_INTERVAL = 80               # seconds between two polls
_PROFILE_URL = 'https://xueqiu.com/u/' + usr


def _load_last_seen_id():
    """Return the article id remembered from a previous run ('' if none).

    The file is closed deterministically via ``with``; a missing file is
    treated as "nothing archived yet" instead of aborting the script.
    """
    try:
        with open(_STATE_FILE) as state:
            # NOTE(review): only the first 10 characters are read, as in the
            # original code.  What Toolkit.save2file writes to this file is
            # not visible here, so the limit is kept -- confirm it is still
            # long enough for current ids.
            return state.read(10).strip()
    except FileNotFoundError:
        return ''


def _check_for_new_article(last_seen):
    """Poll the profile page once and archive the newest article if unseen.

    Args:
        last_seen: id of the most recently archived article.

    Returns:
        The id to treat as "last seen" from now on: the new id when an
        article was archived, otherwise ``last_seen`` unchanged.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    profile_html = session.get(
        _PROFILE_URL, headers=headers, timeout=_REQUEST_TIMEOUT).text
    # findall scans the whole page; the first hit is used as the newest id.
    status_ids = _LAST_STATUS_RE.findall(profile_html)

    print(last_seen)
    if not status_ids:
        # The page did not contain the expected marker; skip this poll
        # rather than crash with an IndexError.
        print('no "last_status_id" found in profile page; skipping this poll')
        return last_seen

    newest_id = status_ids[0]
    print(newest_id)
    if newest_id == last_seen:
        return last_seen

    article_url = 'https://xueqiu.com/' + usr + '/' + newest_id
    article_html = session.get(
        article_url, headers=headers, timeout=_REQUEST_TIMEOUT).text

    titles = etree.HTML(article_html).xpath('//title/text()')
    bodies = _ARTICLE_TEXT_RE.findall(article_html)
    if not titles or not bodies:
        # Leave last_seen untouched so the article is retried on the next poll.
        print('could not extract title/text from ' + article_url)
        return last_seen

    # Drop characters that are not allowed in file names (plus the site name).
    filename = _TITLE_STRIP_RE.sub('', titles[0])

    Toolkit.save2file(filename + ".html", bodies[0])
    Toolkit.save2file(_STATE_FILE, newest_id)
    return newest_id


artical_old = _load_last_seen_id()

for _ in range(1, 1500):
    try:
        artical_old = _check_for_new_article(artical_old)
    except requests.RequestException as exc:
        # A transient network problem must not end a long-running poller.
        print('request failed, retrying on next poll:', exc)
    time.sleep(_POLL_INTERVAL)
